{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "f802e64d-4eaa-445d-a48a-1042a91bc394",
   "metadata": {
    "tags": []
   },
   "source": [
    "# 智能问答机器人：简历筛选小助手\n",
    "\n",
    "## 目标\n",
    "通过对本地简历库的简历进行解析、切片、创建索引，实现基于JD进行简历筛选，并对筛选的Top1简历进行总结的功能\n",
    "\n",
    "## 准备工作\n",
    "### 平台注册\n",
    "1.先在appbuilder平台注册，获取token\n",
    "2.创建BES集群，详见(https://cloud.baidu.com/doc/BES/s/0jwvyk4tv)\n",
    "3.安装appbuilder-sdk"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2939356f-61c2-42e9-9e0c-fc6729c193f6",
   "metadata": {},
   "outputs": [],
   "source": [
    "!pip install appbuilder-sdk\n",
    "!pip install elasticsearch==7.11.0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4ccff03b-1567-4e8b-8e1f-9a5032690406",
   "metadata": {},
   "outputs": [],
   "source": [
    "import logging\n",
    "import os\n",
    "\n",
    "#  设置环境变量\n",
    "os.environ[\"APPBUILDER_TOKEN\"] = \"...\"\n",
    "\n",
    "# 创建BES集群后获得集群id、用户名、密码\n",
    "cluster_id = \"...\"\n",
    "username = \"...\"\n",
    "password = \"...\"\n",
    "\n",
    "print(\"init done\")\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "aeb2fa55-075f-48df-a9fb-8b40d9900684",
   "metadata": {},
   "source": [
    "## 开发过程\n",
    "### 对简历文档内容进行解析、切分"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "41559341-fd7a-478c-a08b-1477d79e9d41",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-12-18T06:24:26.982459Z",
     "start_time": "2023-12-18T06:23:53.771345Z"
    }
   },
   "outputs": [],
   "source": [
    "import appbuilder\n",
    "from pathlib import Path\n",
    "\n",
    "# 基于doc_parser和doc_splitter解析file_path文件为若干个段落\n",
    "def parse_file(file_path, doc_parser, doc_splitter):\n",
    "    input_msg = appbuilder.Message(str(file_path))\n",
    "    doc_parser_result = doc_parser(input_msg, return_raw=True)\n",
    "    doc_splitter_result = doc_splitter(doc_parser_result)\n",
    "    return [f\"{file_path.name}+{para['text'][:384]}\" \n",
    "            for para in doc_splitter_result.content[\"paragraphs\"]]\n",
    "\n",
    "# 文档切分的分块大小，每个分块最大340个字符\n",
    "chunk_size = 340\n",
    "\n",
    "# 本地简历库地址\n",
    "file_dir = \"../简历\"\n",
    "\n",
    "# 声明文档解析和文档切分组件\n",
    "doc_parser = appbuilder.DocParser()\n",
    "doc_splitter = appbuilder.DocSplitter(splitter_type=\"split_by_chunk\", max_segment_length=chunk_size)       \n",
    "\n",
    "# 批量解析，形成段落切片列表\n",
    "resume_paragraphs = [para_text \n",
    "            for file in Path(file_dir).iterdir() if file.is_file()\n",
    "            for para_text in parse_file(file, doc_parser, doc_splitter)]\n",
    "\n",
    "print(f\"total length of splitted paragraphs {len(resume_paragraphs)}\")\n",
    "\n",
    "print(\"sample resume paragraphs\")\n",
    "for paras in resume_paragraphs[:3]:\n",
    "    print(paras)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5fc5bc38-6bc5-4187-a8fd-f802d77d89fa",
   "metadata": {},
   "source": [
    "### 计算文档切片的语义向量并入库"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d514a628-5ae3-4269-aada-eaf8b21c3793",
   "metadata": {},
   "outputs": [],
   "source": [
    "import time\n",
    "# 千帆Embedding\n",
    "embedding = appbuilder.Embedding()\n",
    "\n",
    "# 将段落切片列表入库到BESVectorStoreIndex，这里面用到的Baidu Elastic Search服务\n",
    "segments = appbuilder.Message(resume_paragraphs)\n",
    "vector_index = appbuilder.BESVectorStoreIndex.from_segments(\n",
    "    segments=segments, cluster_id=cluster_id, user_name=username, \n",
    "    password=password, embedding=embedding)\n",
    "\n",
    "# bes向量入库是异步操作，此处等待bes进行refresh\n",
    "time.sleep(5)\n",
    "print(\"index done.\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "16a8aa38-7a33-4e27-bca4-00900cfe1641",
   "metadata": {},
   "source": [
    "### 从工作职责描述中抽取关键词"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9f45ef5f-6206-4b31-83c4-3c8eb2c86925",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 招聘jd\n",
    "from collections import Counter, defaultdict\n",
    "\n",
    "# 工作职责描述\n",
    "job_desc = '''拥有大语言模型领域的研究背景，发表过期刊会议论文者加分；\n",
    "具备深度学习平台开发或大模型开发经验，能够独立完成任务；\n",
    "具备优秀的编程能力，熟悉tensorflow、pytorch、paddlepaddle中的一种深度学习框架；\n",
    "对人工智能有浓厚兴趣，愿意投入时间和精力深入研究；\n",
    "具备团队协作精神，能够与团队成员有效沟通，共同推进项目进展。'''\n",
    "\n",
    "# 标签抽取的组件\n",
    "tagger = appbuilder.TagExtraction(model=\"eb-turbo-appbuilder\")\n",
    "\n",
    "# 从JD抽取标签并打印\n",
    "tags = tagger(appbuilder.Message(job_desc))\n",
    "\n",
    "print(\"从JD抽取标签并打印\")\n",
    "print(tags.content)\n",
    "tag_list = [tag.split(\".\")[1] for tag in tags.content.split(\"\\n\")]\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "21ef57a9-46cf-4939-b375-68e21d4b3e90",
   "metadata": {},
   "source": [
    "### 基于工作描述的关键词从简历库中检索简历并排序"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7313e122-5199-4c90-bc6c-ad04e206ccc9",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 基于JD抽取出来的标签，使用retriever对简历库进行检索，基于简历文件名称命中次数进行汇总和排序\n",
    "retriever = vector_index.as_retriever()\n",
    "\n",
    "resume_count = Counter()\n",
    "resume_content = defaultdict(set)\n",
    "\n",
    "for tag in tag_list:\n",
    "    print(\"Going to retrieve resumes for tag: \", tag, \"\\nGot Results\")\n",
    "    relevant_resumes = retriever(query=appbuilder.Message(tag), top_k=3)\n",
    "    for idx, res in enumerate(relevant_resumes.content):\n",
    "        name_and_content = res[\"text\"].split(\"+\")\n",
    "        print(f\"{idx+1}: {name_and_content[0]}, {name_and_content[1]}\")\n",
    "        resume_count[name_and_content[0]] += 1\n",
    "        resume_content[name_and_content[0]].add(name_and_content[1])\n",
    "    print()\n",
    "\n",
    "sorted_resumes = sorted(resume_count.items(), key=lambda x: x[1], reverse=True)\n",
    "print(sorted_resumes)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "36ba212a-774b-4f26-a860-988a872423ac",
   "metadata": {},
   "source": [
    "### 对检索得到的最优简历进行总结"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e38701cd-9fbf-4da4-982b-a87a9b310427",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "play = appbuilder.Playground(\n",
    "    prompt_template=\"基于候选人姓名、职责描述和简历内容，概括一下{name}的推荐理由。\\n候选人姓名: {name}\\n职责描述: {JD}\\n简历内容: {resume}\\n推荐理由: \",\n",
    "    model=\"ernie-bot-4\"\n",
    ")\n",
    "\n",
    "resume_summary = play(appbuilder.Message({\"JD\": job_desc, \"name\": sorted_resumes[0][0], \"resume\": \"\\n\".join(list(resume_content[sorted_resumes[0][0]]))}))\n",
    "print(resume_summary.content)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
